use ${dirInput}/compustat_quarterly, clear

* Prepare various variables
* ==============================
destring gvkey sic, replace

gen date_quarterly = qofd(datadate) 
rename indfmt indfmtq
rename naics naicsq
rename sic sicq

keep if date_quarterly <= yq(2017,3)
keep if date_quarterly >= yq(1980,1)
keep if fic=="USA"
 
destring naicsq, replace
destring sicq, replace
destring gvkey, replace

format date_quarterly %tq
rename gvkey id

* Industry classification based on two-digit NAICS code
drop if naicsq==. 
gen naicsq2=.
replace naicsq2 = floor(naicsq/10000) if naicsq >= 1e5 & naicsq < 1e6
replace naicsq2 = floor(naicsq/1000) if naicsq >= 1e4 & naicsq < 1e5
replace naicsq2 = floor(naicsq/100) if naicsq >= 1e3 & naicsq < 1e4
replace naicsq2 = floor(naicsq/10) if naicsq >= 1e2 & naicsq < 1e3
replace naicsq2 = naicsq if naicsq < 1e2  & naicsq > 10

* Industry classification based on three-digit NAICS code
drop if naicsq==. 
gen naicsq3=.
replace naicsq3 = floor(naicsq/1000) if naicsq >= 1e5 & naicsq < 1e6
replace naicsq3 = floor(naicsq/100) if naicsq >= 1e4 & naicsq < 1e5
replace naicsq3 = floor(naicsq/10) if naicsq >= 1e3 & naicsq < 1e4
replace naicsq3 = naicsq if naicsq >= 1e2 & naicsq < 1e3

* Industry classification based on four-digit NAICS code
gen naicsq4=.
replace naicsq4 = floor(naicsq/100) if naicsq >= 1e5 & naicsq < 1e6
replace naicsq4 = floor(naicsq/10) if naicsq >= 1e4 & naicsq < 1e5 
replace naicsq4 = naicsq if naicsq >= 1e3 & naicsq < 1e4 

* Industry classification based on five-digit NAICS code
gen naicsq5=.
replace naicsq5 = floor(naicsq/10) if naicsq >= 1e5 & naicsq < 1e6
replace naicsq5 = naicsq if naicsq >= 1e4 & naicsq < 1e5  

* Industry classification based on five-digit NAICS code
gen naicsq6=.
replace naicsq6 = naicsq if naicsq >= 1e5 & naicsq < 1e6 

gen naicsq0 = 0 

* In case of duplicates, keep the financial format for FIRE firms and the industry format otherwise
duplicates tag id date_q, gen(dup)
drop if dup>0 & indfmt=="FS" & ~(naicsq2==52 | naicsq2 == 53)
drop if dup>0 & indfmt=="INDL" & (naicsq2==52 | naicsq2 == 53) 
drop dup

* Don't need industry format identifier anymore
drop indfmt

* Drop all remaining duplicates
duplicates tag id date_q, gen(dup)
count if dup>0
drop if dup>0
drop dup

* Panel structure
xtset id date_quarterly

* Check that all firms have a NAICS-2 code
assert naicsq2 != . 

* Data cleaning
* ===================

* Timing: ppentq/ppegtq lagged by 1 period
gen aux = L.ppegtq
replace ppegtq = aux
drop aux 

gen aux = L.ppentq
replace ppentq = aux
drop aux 

* Drop utility, public and "FIRE" sectors  
drop if naicsq2 == 99 // public administration
drop if naicsq2 == 22 // utilities
drop if (naicsq2==52 | naicsq2 == 53) // FIRE

* Replace implausible observations by missing
foreach var of varlist dpq xsgaq dlcq dlttq cheq {
	replace `var' = 0 if `var' == .
}
foreach var of varlist saleq cogsq ppentq ppegtq atq {
	replace `var' = . if `var'<=0
} 
foreach var of varlist dpq xsgaq dlcq dlttq cheq { 
	replace `var' = . if `var' < 0
}

* Replace gaps in data
* =========================
gen beforeFill = 1 

tsfill 
foreach var in ppegtq ppentq saleq cogsq xsgaq dpq atq dlcq dlttq cheq {
	
	* Interpolate
	replace `var' = (F.`var'+L.`var')/2 if `var'==. & F.`var'!=. & L.`var'!=.
	
	* Fill up NAICS code
	replace naicsq0 = L.naicsq0 if `var' != . & beforeFill == .  
	replace naicsq2 = L.naicsq2 if `var' != . & beforeFill == .  
	replace naicsq3 = L.naicsq3 if `var' != . & beforeFill == .
	replace naicsq4 = L.naicsq4 if `var' != . & beforeFill == .
	replace naicsq5 = L.naicsq5 if `var' != . & beforeFill == .
	replace naicsq6 = L.naicsq6 if `var' != . & beforeFill == . 
	
}

* If no interpolation happened, drop again
drop if beforeFill == . & naicsq2 == . 
drop beforeFill
 
* Deflate variables
* ======================
merge m:1 date_quarterly using ${dirTemp}/fred_data_quarterly, gen(_merge2) keepusing(GDPDEF GDPDEFINV) keep(1 3)

replace GDPDEF = GDPDEF / 100
replace GDPDEFINV = GDPDEFINV / 100

foreach var in saleq cogsq xsgaq invfgq invtq atq dlcq dlttq cheq rdipq xrdq dpq txtq oibdpq oiadpq {
	replace `var' = `var'/GDPDEF
}
foreach var in ppegtq ppentq {
	replace `var' = `var'/GDPDEFINV
} 

drop GDPDEF GDPDEFINV
 
* Perpetual inventory method  
* ==================================
xtset id date_q
global minInvSpell = 1

* Focus on "investment spell" of >=$minInvSpell quarters
* To this end, find the length of each "investment spell" 
gen ppentq2=ppentq
replace ppentq2=(L.ppentq+F.ppentq)/2 if ppentq==.
cap drop run 
cap drop spell 
gen run = .
bysort id: replace run = cond(L.run == ., 1, L.run + 1) if ppentq2!=.
gen newspell = 1 if run==1  
bysort id: gen spell = sum(newspell)
replace spell = . if run==.
drop newspell
bysort id spell: egen spelllength = max(run) 
drop run 
gen dumSpell = (spelllength >= $minInvSpell & ~missing(spell))
xtset id date_quar

* Find first observation of ppegtq within spell
cap drop aux* 
bysort id spell (date_quarterly): gen auxn = _n  
bysort id spell: egen aux_idx_ppegtq_first = min(auxn) if ~missing(ppegtq) 
bysort id spell: egen aux_idx_ppegtq_first2 = mean(aux_idx_ppegtq_first) 

* Perpetual inventory method series starts there
bysort id spell: gen ppeitq = ppegtq if auxn == aux_idx_ppegtq_first 

* We have to create a temporary "id" variable: id x spell, which we'll use as a unit identifier 
preserve 

drop if spell == .  
gen spacer = 999
egen idspell = concat(spell spacer id)
destring idspell, replace 
xtset idspell date_quarterly

* Core of the PIM:
bysort idspell: ///
	replace ppeitq = L.ppeitq + ppentq2 - L.ppentq2 ///
	if auxn > aux_idx_ppegtq_first2  

keep id date_quarterly ppeitq*
save ${dirTemp}/ppeitq_2, replace 

restore
	
* Merge back to data 
cap drop aux*
merge 1:1 id date_quarterly using ${dirTemp}/ppeitq_2, update gen(_merge_ppeitq)
drop if _merge_ppeitq == 2
drop _merge_ppeitq
drop spell spelllength dumSpell ppentq2
shell rm ${dirTemp}/ppeitq_2.dta 
replace ppeitq = . if ppeitq <= 0 // drop if capital negative  


* Add Gutierrez/Philippon industry-level data
* the following taken from Baqaee/Fahri (2020) who base it on Gutierrez/Philippon
* see https://academic.oup.com/qje/article-abstract/135/1/105/5573281?redirectedFrom=fulltext
* =================================================================================================
 
*** "Map codes manually to BEA categories
* Manual mapping required for 2 reasons: 
* 1. Compustat shows outdated naicsq3 codes (as compared to 2007 naicsq3)
* 2. Some categories defined by BEA group arbitrary naicsq3 codes. See below for 
* 	 rationale behind any adjustment 
* I have confirmed no code except 999 remains un-mapped"
 
gen beacode = naicsq3
replace beacode = 110 if beacode>=111 & beacode<=112 // BEA footnote
replace beacode = 113 if beacode>=113 & beacode<=115 // BEA grouping name
replace beacode = 220 if beacode==221 // direct mapping
replace beacode = 230 if beacode>=230 & beacode<=238 // direct mapping
replace beacode = 230 if beacode==23 // direct mapping
replace beacode = 311 if beacode==312 // BEA grouping name
replace beacode = 313 if beacode==314 // BEA grouping name
replace beacode = 315 if beacode==316 // BEA grouping name
replace beacode = 338 if beacode==339 // BEA grouping name
replace beacode = 420 if beacode>=420 & beacode< 430 // BEA Grouping name
replace beacode = 420 if naicsq3 == 42 // BEA Grouping name
replace beacode = 440 if beacode>=440 & beacode<=459 // BEA Grouping name
replace beacode = 487 if beacode>=487 & beacode<= 488 // BEA footnote. Note: exclude Postal service (code 491) 
replace beacode = 487 if beacode==492 // BEA footnote
replace beacode = 513 if beacode>=513 & beacode<= 517 // BEA grouping name
replace beacode = 513 if beacode>=571 & beacode<= 580 // BEA footnote
replace beacode = 514 if beacode>=518 & beacode<= 519 // BEA grouping name
replace beacode = 532 if beacode>=532 & beacode<= 533 // BEA grouping name
replace beacode = 541 if beacode>=540 & beacode< 550 // BEA grouping name
replace beacode = 550 if beacode>=550 & beacode< 560 // BEA grouping name
replace beacode = 550 if naicsq3 == 55 // BEA grouping name
replace beacode = 562 if beacode>=562 & beacode< 570 // BEA grouping name
replace beacode = 610 if beacode>=610 & beacode< 620 // BEA grouping name
replace beacode = 610 if naicsq3 == 61  // BEA grouping name
replace beacode = 622 if beacode==623 // Merged due to missing output data
replace beacode = 624 if beacode>=624 & beacode< 630 // BEA grouping name
replace beacode = 711 if beacode==712 // BEA grouping name
replace beacode = 810 if beacode>=810 & beacode< 820 // BEA grouping name

* Merge with BF/GP data 
gen year = year(dofq(date_q))
merge m:1 beacode year using ${dirTemp}/BF_industry_data, keepusing(ind_all rk a1_depk_all_bea)
drop if _merge==2

* Convert rates to quarterly
replace rk = rk/4 
replace a1_depk_all_bea = a1_depk_all_bea/4 

* Housekeeping
drop beacode year ind_all _merge
 

* Add price stickiness data 
* ==============================

* Price stickiness, 3d-level
merge m:1 naicsq3 using ${dirTemp}/stickiness_naics3, keepusing(freq3d dur3d) keep(1 3) nogen

* Price stickiness, 5d-level
merge m:1 naicsq5 using ${dirTemp}/stickiness_naics5, keepusing(freq5d dur5d) keep(1 3) nogen

* Price stickiness, imputed firm-level 
merge m:1 id using ${dirTemp}/firm_rigidity, keepusing(freq dur) keep(1 3) nogen

* Final measure
gen freq_avg3 = freq if freq3d < .
replace freq_avg3 = freq5d if freq == .

gen dur_avg3 = dur if dur3d < .
replace dur_avg3 = dur5d if freq == .

* Housekeeping
drop freq dur freq5d dur5d freq3d dur3d


* Prepare indicators for data treatments
* =============================================

* "reports_once": only one quarter of full observation in given year 
gen full_obs = 1 
foreach var of varlist saleq cogsq ppeitq {
	replace full_obs = 0 if `var' == .
}

cap gen year = year(dofq(date_q))
bysort year id: egen aux = sum(full_obs)
gen reports_once = (aux <= 1)
drop aux
drop year

* "sales_low": deflated sales below 1mln
gen sales_low = (saleq < 1)

* "excessive_growth": Sales growth > 100% or <-67%
xtset id date_q
gen aux = D.saleq/L.saleq 
gen excessive_growth = (aux > 1 & aux !=.) | aux < -.67
drop aux   

* "min_16_obs": at least 16 consecutive observations
xtset id date_q
cap drop run 
gen run = .
bysort id: replace run = cond(L.run == ., 1, L.run + 1) if full_obs 
gen newspell = 1 if run==1  
bysort id: gen spell = sum(newspell)
replace spell = . if run==.
drop newspell
bysort id spell: egen spelllength = max(run) 
gen runinv = spelllength - run + 1 
gen min_16_obs = ~(spelllength < 16)  
drop run* spell* 
 
gen no_outlier = sales_low == 0 & reports_once == 0 & excessive_growth == 0 

* Drop if industry info missing
* ===================================

* drop if firm's naics2 code missing 
drop if naicsq2==. 

* Drop if only 1 full observation in naics4 industry 
bysort date_q naicsq4: egen N = sum(full_obs) if naicsq4!=.
drop if N <= 1
drop N

* Indicators
gen baseline = (no_outlier==1)
gen lowsales = (no_outlier==1 | sales_low == 1)
gen excgro = (no_outlier==1 | excessive_growth == 1)
gen min16 = (no_outlier==1 & min_16_obs == 1)


* Save data
* ================
xtset id date_q
order id date_q
save ${dirTemp}/compustat_quarterly_prepared, replace  

use ${dirTemp}/compustat_quarterly_prepared, clear

* Prepare PF inputs
* ========================
foreach data in baseline lowsales excgro min16 {
forvalues k = 2(2)4 { 
preserve
	keep if `data' == 1

	gen date = date_quarterly
	drop if naicsq`k'==. 
	
	* Create data to be exported
	gen y = log(saleq)  
	gen k = log(ppeitq)
	gen v = log(cogsq)    

	gen Ly = L.y 
	gen Lv = L.v
	gen Lk = L.k
	gen L4v = L4.v 

	* Drop if anything missing
	foreach var in y v k Ly Lv Lk L4v {
		drop if `var'==.
	}     
	replace naicsq = naicsq`k'
	keep id naicsq date y v k Ly Lv Lk L4v
	order naicsq date y v k Ly Lv Lk L4v id
	sort naicsq id date  
	export delimited ${dirTemp}/pf_input_naicsq`k'_`data', replace 
restore
} 	
}

* Prepare for baseline with SGA included
* ==================================================
preserve
	keep if baseline == 1

	gen date = date_quarterly
	drop if naicsq2==. 
	
	* Create data to be exported
	gen y = log(saleq)  
	gen k = log(ppeitq)
	gen v = log(cogsq+xsgaq) if cogsq > 0

	gen Ly = L.y 
	gen Lv = L.v
	gen Lk = L.k
	gen L4v = L4.v 

	* Drop if anything missing
	foreach var in y v k Ly Lv Lk L4v {
		drop if `var'==.
	}     
	
	replace naicsq = naicsq2
	keep id naicsq date y v k Ly Lv Lk L4v
	order naicsq date y v k Ly Lv Lk L4v id
	sort naicsq id date  
	export delimited ${dirTemp}/pf_input_naicsq2_baseline_sga, replace 
restore

* Run PF estimation in Matlab
* =======================================
shell $matlabPath -nodesktop -r "cd('`c(pwd)'/02_prepare_micromacro_data/functions/'); estimate_pf_CD('baseline',2); exit;"
shell $matlabPath -nodesktop -r "cd('`c(pwd)'/02_prepare_micromacro_data/functions/'); estimate_pf_CD('baseline_sga',2); exit;"
shell $matlabPath -nodesktop -r "cd('`c(pwd)'/02_prepare_micromacro_data/functions/'); estimate_pf_CD('lowsales',2); exit;"
shell $matlabPath -nodesktop -r "cd('`c(pwd)'/02_prepare_micromacro_data/functions/'); estimate_pf_CD('excgro',2); exit;"
shell $matlabPath -nodesktop -r "cd('`c(pwd)'/02_prepare_micromacro_data/functions/'); estimate_pf_CD('min16',2); exit;"
shell $matlabPath -nodesktop -r "cd('`c(pwd)'/02_prepare_micromacro_data/functions/'); estimate_pf_TL('baseline',4); exit;"

* Import back to stata 
* ==============================
foreach data in baseline baseline_sga lowsales excgro min16 {
	preserve
	import delimited "${dirTemp}/pf_CD_naicsq2_`data'.csv", clear case(preserve)
	format date_q %tq
	rename naicsq naicsq2
	rename thetaCogsCD thetaCogsCD_`data'
	sort naicsq2 date_q    
	save ${dirTemp}/pf_CD_`data', replace 
	restore 
}

preserve
import delimited "${dirTemp}/pf_TL_naicsq4_baseline.csv", clear case(preserve)
rename naics naicsq4
rename betaCogsTL betaCogsTL_baseline
rename betaCogs2TL betaCogs2TL_baseline
sort naicsq4
save ${dirTemp}/pf_TL_baseline, replace 
restore 

